import pandas as pd
import statsmodels.formula.api as smf
import numpy as np

# Load data
df = pd.read_csv('cleaned_with_male_head.csv')

# Group regions as in Table 6
df['region_grouped'] = df['region']
df.loc[df['region_grouped'].isin(['Northern Midlands and Mountains', 'Northwest']), 'region_grouped'] = 'Northern Midlands and Mountainous Area'
df.loc[df['region_grouped'].isin(['North Central Coast', 'South Central Coast']), 'region_grouped'] = 'North Central & Central Coastal Area'
df['region_grouped'] = df['region_grouped'].astype('category')

# Categorical vars
df['CHILD'] = df['CHILD'].astype('category')
df['H_TYPE'] = df['H_TYPE'].astype('category')
df['year'] = df['year'].astype('category')

# Demean function
def demean(df, vars_to_demean, group_col='code_id_hh'):
    df_demean = df.copy()
    for var in vars_to_demean:
        if var in df_demean.columns:
            group_mean = df_demean.groupby(group_col)[var].transform('mean')
            df_demean[var] = df_demean[var] - group_mean
    return df_demean

vars_to_demean = ['fert', 'htype', 'mar', 'emp', 'pri', 'sec', 'high', 'inc', 'fam']
df_demean = demean(df, vars_to_demean)

# Function for CI
def get_param_info(model, param):
    coeff = model.params.get(param, np.nan)
    se = model.bse.get(param, np.nan)
    if np.isnan(coeff):
        return "N/A"
    ci_low = coeff - 1.96 * se
    ci_high = coeff + 1.96 * se
    return f"{round(coeff, 4)} ({round(ci_low, 4)} to {round(ci_high, 4)})"

# Binary model with region FE
formula_bin_reg = 'fert ~ htype + C(CHILD) + mar + emp + pri + sec + high + inc + fam + C(year) + C(region_grouped)'
model_bin_reg = smf.ols(formula_bin_reg, data=df_demean).fit(cov_type='cluster', cov_kwds={'groups': df_demean['code_id_hh']})

# Binary with interaction
formula_bin_int = 'fert ~ htype * C(region_grouped) + C(CHILD) + mar + emp + pri + sec + high + inc + fam + C(year)'
model_bin_int = smf.ols(formula_bin_int, data=df_demean).fit(cov_type='cluster', cov_kwds={'groups': df_demean['code_id_hh']})


# Build Table 5b (binary with interaction, focus on key terms)
table5b_data = {
    'Variable': [
        'Living with grandparent(s)^a',
        'Interaction terms (htype * region)^c:',
        '- Northern Midlands and Mountainous Area',
        '- North Central & Central Coastal Area',
        '- Central Highlands',
        '- Mekong River Delta',
        'Observations',
        'Households',
        'Individual FE',
        'Year FE',
        'R-squared'
    ],
    'No. of children under 3': [
        get_param_info(model_bin_int, 'htype'),
        '',
        get_param_info(model_bin_int, 'htype:C(region_grouped)[T.Northern Midlands and Mountainous Area]'),
        get_param_info(model_bin_int, 'htype:C(region_grouped)[T.North Central & Central Coastal Area]'),
        get_param_info(model_bin_int, 'htype:C(region_grouped)[T.Central Highlands]'),
        get_param_info(model_bin_int, 'htype:C(region_grouped)[T.Mekong River Delta]'),
        int(model_bin_int.nobs),
        df['code_id_hh'].nunique(),
        'Yes',
        'Yes',
        round(model_bin_int.rsquared, 3)
    ]
}
table5b = pd.DataFrame(table5b_data)
print("\nTable 5b: Binary Model with Region Interaction\n", table5b)
table5b.to_csv('revise_table_5b_region.csv', index=False)

